Data Processing
# Load libraries needed
library(tidyverse)
library(lubridate)
License_Application <- read_csv("data/License_Applications.csv")
Warning: One or more parsing issues, see `problems()` for details
Rows: 420857 Columns: 25
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (21): Application ID, License Number, License Type, Application or Renewal, Business Name, Status, Start Date, End Date, Temp Op Letter Expiration, License ...
dbl (2): Longitude, Latitude
lgl (1): Active Vehicles
date (1): Temp Op Letter Issued
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(License_Application);names(License_Application)
[1] 420857 25
[1] "Application ID" "License Number" "License Type" "Application or Renewal" "Business Name"
[6] "Status" "Start Date" "End Date" "Temp Op Letter Issued" "Temp Op Letter Expiration"
[11] "License Category" "Application Category" "Building Number" "Street" "Street 2"
[16] "Unit Type" "Unit" "Description" "City" "State"
[21] "Zip" "Contact Phone" "Longitude" "Latitude" "Active Vehicles"
License_App <- License_Application %>% filter(State == "NY") %>%
mutate(Start_date = mdy(`Start Date`), End_date = mdy(`End Date`), City = tolower(City)) %>%
dplyr::select(Start_date, End_date, `Application ID`, `License Number`, `License Type`,
`Application or Renewal`,`Business Name`,Status,`License Category`,`Application Category`,`Building Number`,Street,City,State,Zip,Longitude,Latitude) %>% filter(Start_date >= as.Date("2017-01-01")) %>%
arrange(Start_date)
Five boroughs in new york are Brooklyn, Bronx, Manhattan,Queens, State Island. Queens indicated as “queens village” or “queens vlg”
NYC_License <- License_App %>% filter(City %in% c("bronx","brooklyn","new york",
"staten island", "manhattan", "queens village", "queens vlg"))
#Category count by month
License_by_month <- NYC_License %>% group_by(month = floor_date(Start_date,"month"),`Application or Renewal`,`License Category`) %>% summarise(cnt = n()) %>%
arrange(month)
`summarise()` has grouped output by 'month', 'Application or Renewal'. You can override using the `.groups` argument.
#Category_cnt of License Application
Category_cnt <- NYC_License %>% group_by(year = floor_date(Start_date,"year"),`Application or Renewal`,`License Category`) %>% summarise(cnt = n()) %>%
arrange(year, desc(cnt))
`summarise()` has grouped output by 'year', 'Application or Renewal'. You can override using the `.groups` argument.
#total_app_by_year <- License_App %>% group_by(year = floor_date(Start_date,"year"),`Application or Renewal`) %>% summarise(cnt = n()) %>%
# arrange(year)
#total_app
total_app_by_month <-
License_App %>% group_by(month = floor_date(Start_date,"month"),`Application or Renewal`) %>% summarise(cnt = n()) %>%
arrange(month)
`summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
ggplot(total_app_by_month, aes(x = month, y = cnt)) +
geom_line(aes(color = `Application or Renewal`)) +
labs(y = "Number of License renewed and applied in month") +
scale_x_date(breaks = "4 month") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
Covid_19_raw <- read_csv("data/COVID-19.csv")
Rows: 718 Columns: 62
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): DATE_OF_INTEREST
dbl (61): CASE_COUNT, probable_case_count, HOSPITALIZED_COUNT, DEATH_COUNT, DEATH_COUNT_PROBABLE, CASE_COUNT_7DAY_AVG, all_case_count_7day_avg, HOSP_COUNT_7DAY_A...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(Covid_19_raw);names(Covid_19_raw)
[1] 718 62
[1] "DATE_OF_INTEREST" "CASE_COUNT" "probable_case_count" "HOSPITALIZED_COUNT"
[5] "DEATH_COUNT" "DEATH_COUNT_PROBABLE" "CASE_COUNT_7DAY_AVG" "all_case_count_7day_avg"
[9] "HOSP_COUNT_7DAY_AVG" "DEATH_COUNT_7DAY_AVG" "all_death_count_7day_avg" "BX_CASE_COUNT"
[13] "bx_probable_case_count" "BX_HOSPITALIZED_COUNT" "BX_DEATH_COUNT" "bx_probable_death_count"
[17] "BX_CASE_COUNT_7DAY_AVG" "bx_all_case_count_7day_avg" "BX_HOSPITALIZED_COUNT_7DAY_AVG" "BX_DEATH_COUNT_7DAY_AVG"
[21] "bx_all_death_count_7day_avg" "BK_CASE_COUNT" "bk_probable_case_count" "BK_HOSPITALIZED_COUNT"
[25] "BK_DEATH_COUNT" "bk_probable_death_count" "BK_CASE_COUNT_7DAY_AVG" "bk_all_case_count_7day_avg"
[29] "BK_HOSPITALIZED_COUNT_7DAY_AVG" "BK_DEATH_COUNT_7DAY_AVG" "bk_all_death_count_7day_avg" "MN_CASE_COUNT"
[33] "mn_probable_case_count" "MN_HOSPITALIZED_COUNT" "MN_DEATH_COUNT" "mn_probable_death_count"
[37] "MN_CASE_COUNT_7DAY_AVG" "mn_all_case_count_7day_avg" "MN_HOSPITALIZED_COUNT_7DAY_AVG" "MN_DEATH_COUNT_7DAY_AVG"
[41] "mn_all_death_count_7day_avg" "QN_CASE_COUNT" "qn_probable_case_count" "QN_HOSPITALIZED_COUNT"
[45] "QN_DEATH_COUNT" "qn_probable_death_count" "QN_CASE_COUNT_7DAY_AVG" "qn_all_case_count_7day_avg"
[49] "QN_HOSPITALIZED_COUNT_7DAY_AVG" "QN_DEATH_COUNT_7DAY_AVG" "qn_all_death_count_7day_avg" "SI_CASE_COUNT"
[53] "si_probable_case_count" "SI_HOSPITALIZED_COUNT" "SI_DEATH_COUNT" "si_probable_death_count"
[57] "SI_CASE_COUNT_7DAY_AVG" "si_all_case_count_7day_avg" "SI_HOSPITALIZED_COUNT_7DAY_AVG" "SI_DEATH_COUNT_7DAY_AVG"
[61] "si_all_death_count_7day_avg" "INCOMPLETE"
Covid_19 <- Covid_19_raw %>% mutate(Date = mdy(DATE_OF_INTEREST), .before = DATE_OF_INTEREST) %>%
dplyr::select(Date, CASE_COUNT,probable_case_count, HOSPITALIZED_COUNT,DEATH_COUNT,
DEATH_COUNT_PROBABLE,CASE_COUNT_7DAY_AVG,all_case_count_7day_avg,
HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,all_death_count_7day_avg)
Join two tables
#covid 19 count by month, case count and death count,Date, CASE_COUNT,probable_case_count, HOSPITALIZED_COUNT,DEATH_COUNT,DEATH_COUNT_PROBABLE
Covid_by_month <- Covid_19 %>% group_by(month = floor_date(Date,"month")) %>%
summarise(monthly_case_count = sum(CASE_COUNT), monthly_death = sum(DEATH_COUNT), monthly_hospitalized = sum(HOSPITALIZED_COUNT),
monthly_case_probable = sum(probable_case_count), monthly_death_probable = sum(DEATH_COUNT_PROBABLE)) %>%
arrange(desc(month))
Category_by_month <- NYC_License %>%
group_by(month = floor_date(Start_date,"month"),
`Application or Renewal`,`License Category`) %>%
summarise(License_cnt = n()) %>%
arrange(month)
`summarise()` has grouped output by 'month', 'Application or Renewal'. You can override using the `.groups` argument.
Month_Application_Covid <- Category_by_month %>%
inner_join(Covid_by_month, by = c("month" = "month"))
1.covid cases vs total applications 2.covid cases vs category applications
if (!require("plotly")) install.packages("plotly")
if (!require("viridis")) install.packages("viridis")
if (!require("hrbrthemes")) install.packages("hrbrthemes")
library(plotly)
library(viridis)
library(hrbrthemes)
data %>% arrange(desc(License_cnt))
#covid cases vs category applications
#The dataset is provided in the gapminder library
#data <- Month_Application_Covid %>% filter(`Application or Renewal` == "Application", `License Category` %in% c("Home Improvement Contractor"))
#p <- data %>%
# mutate(text = paste("Date:",month,"\nCovid_19 cases:",monthly_case_count,"\nApplications:",License_cnt,"\nLicense Category:",`License Category`, sep = "")) %>%
# ggplot(aes(x=month,y=monthly_case_count,size = License_cnt, color = `License Category`,text = text))+
# geom_point(alpha=0.7) +
# geom_line(aes(y = monthly_case_count)) +
# scale_size(range = c(1.4, 19), name="Population (M)") +
# scale_color_viridis(discrete=TRUE, guide=FALSE) +
# theme_ipsum() +
# theme(legend.position="none")
#pp <- ggplotly(p, tooltip="text")
# pp
#covid cases vs total applications
data2 <- Month_Application_Covid %>% filter(`Application or Renewal` == "Application") %>%
group_by(month) %>%
summarise(applications = sum(License_cnt), monthly_case_count=mean(monthly_case_count))
data2
p2 <- data2 %>%
mutate(text = paste("Date:",month,"\nCovid_19 cases:",monthly_case_count,"\nApplications:",applications, sep = "")) %>%
ggplot(aes(x=month,y=monthly_case_count,size = applications, color = "lightgreen",text = text))+
geom_point(alpha=0.7) +
geom_line(aes(y = monthly_case_count)) +
scale_size(range = c(1.4, 19), name="Population (M)") +
scale_color_viridis(discrete=TRUE, guide=FALSE) +
theme_ipsum() +
theme(legend.position="none")
pp2 <- ggplotly(p2, tooltip="text")
pp2
NA
NA
if (!require("shiny")) install.packages("shiny")
library(shiny)
data <- Month_Application_Covid %>% filter(`Application or Renewal` == "Application") %>%
mutate(text = paste("Date:",month,"\nCovid_19 cases:",monthly_case_count,"\nApplications:",License_cnt,"\nLicense Category:",`License Category`, sep = "")) %>%
var <- unique(License_by_month$`License Category`)
Error in Month_Application_Covid %>% filter(`Application or Renewal` == :
could not find function "%>%<-"